{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.0125 0.025  0.05   0.1    0.2    0.4    0.8   ]\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# Training-set fractions swept by the ablation cells: 1/80, doubling six times up to 64/80 (0.0125 ... 0.8).\n",
    "rates = 2**np.arange(7)/80\n",
    "print(rates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_inputs(sm, seq_len=220):\n",
    "    '''Convert a whitespace-separated token string into padded id and segment lists.\n",
    "\n",
    "    Args:\n",
    "        sm: tokens joined by whitespace.\n",
    "        seq_len: total output length including the SOS and EOS markers.\n",
    "            Defaults to 220, the value that used to be hard-coded here.\n",
    "\n",
    "    Returns:\n",
    "        (ids, seg): two lists of length seq_len. ids holds vocabulary indices\n",
    "        framed by sos_index and eos_index; seg holds 1 for every real position.\n",
    "        Both are right-padded with pad_index.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if seq_len cannot even hold the SOS and EOS markers.\n",
    "\n",
    "    Reads the notebook-level names vocab, unk_index, sos_index, eos_index and\n",
    "    pad_index, so they must be defined before the first call.\n",
    "    '''\n",
    "    if seq_len < 2:\n",
    "        raise ValueError('seq_len must be at least 2')\n",
    "    max_tokens = seq_len - 2  # room left once SOS and EOS are added\n",
    "    half = max_tokens // 2\n",
    "    sm = sm.split()\n",
    "    if len(sm) > max_tokens:\n",
    "        print('SMILES is too long ({:d})'.format(len(sm)))\n",
    "        # Keep the head and the tail. The tail is sliced by an explicit start index\n",
    "        # because sm[-0:] would return the whole list when half is 0.\n",
    "        sm = sm[:half] + sm[len(sm)-half:]\n",
    "    ids = [vocab.stoi.get(token, unk_index) for token in sm]\n",
    "    ids = [sos_index] + ids + [eos_index]\n",
    "    seg = [1]*len(ids)\n",
    "    padding = [pad_index]*(seq_len - len(ids))\n",
    "    ids.extend(padding)\n",
    "    seg.extend(padding)\n",
    "    return ids, seg\n",
    "\n",
    "def get_array(smiles):\n",
    "    '''Encode every token string with get_inputs and pack the results into tensors.\n",
    "\n",
    "    Returns a pair of torch tensors: one built from the per-molecule id lists and\n",
    "    one from the per-molecule segment lists, both in input order.\n",
    "    '''\n",
    "    encoded = [get_inputs(sm) for sm in smiles]\n",
    "    id_rows = [pair[0] for pair in encoded]\n",
    "    seg_rows = [pair[1] for pair in encoded]\n",
    "    return torch.tensor(id_rows), torch.tensor(seg_rows)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ECFP4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rdkit import Chem\n",
    "from rdkit.Chem import AllChem\n",
    "\n",
    "def bit2np(bitvector):\n",
    "    '''Return a NumPy array with one integer per character of bitvector.ToBitString().'''\n",
    "    return np.array([int(bit) for bit in bitvector.ToBitString()])\n",
    "\n",
    "def extract_morgan(smiles, targets, radius=2, n_bits=1024):\n",
    "    '''Compute Morgan bit-vector fingerprints for every SMILES string RDKit can parse.\n",
    "\n",
    "    Args:\n",
    "        smiles: iterable of SMILES strings.\n",
    "        targets: iterable of labels, paired with smiles by position.\n",
    "        radius: Morgan radius. Defaults to 2, the value that used to be hard-coded.\n",
    "        n_bits: fingerprint length. Defaults to 1024, the value that used to be hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        (x, X, y): the SMILES that were kept (list), their fingerprints (array)\n",
    "        and the matching targets (array). A SMILES for which MolFromSmiles\n",
    "        returns None is printed and left out of all three.\n",
    "    '''\n",
    "    x,X,y = [],[],[]\n",
    "    for sm,target in zip(smiles,targets):\n",
    "        mol = Chem.MolFromSmiles(sm)\n",
    "        if mol is None:\n",
    "            # Report the unparsable string and skip it so x, X and y stay aligned.\n",
    "            print(sm)\n",
    "            continue\n",
    "        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, n_bits) # Morgan (radius 2 is similar to ECFP4)\n",
    "        x.append(sm)\n",
    "        X.append(bit2np(fp))\n",
    "        y.append(target)\n",
    "    return x,np.array(X),np.array(y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ST, RNN, BERT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total parameters: 4245037\n",
      "Total parameters: 4713517\n",
      "Total parameters: 6330368\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from pretrain_trfm import TrfmSeq2seq\n",
    "from pretrain_rnn import RNNSeq2Seq\n",
    "from bert import BERT\n",
    "from build_vocab import WordVocab\n",
    "from utils import split\n",
    "\n",
    "# Special-token ids. get_inputs reads pad_index, unk_index, eos_index and sos_index.\n",
    "# NOTE(review): presumably these must match the ids stored in data/vocab.pkl -- confirm.\n",
    "pad_index = 0\n",
    "unk_index = 1\n",
    "eos_index = 2\n",
    "sos_index = 3\n",
    "mask_index = 4\n",
    "\n",
    "vocab = WordVocab.load_vocab('data/vocab.pkl')\n",
    "\n",
    "# For each pretrained model: build it, restore its saved weights, switch it to eval mode\n",
    "# and print its parameter count.\n",
    "# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.\n",
    "trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 3)\n",
    "trfm.load_state_dict(torch.load('.save/trfm_12_23000.pkl'))\n",
    "trfm.eval()\n",
    "print('Total parameters:', sum(p.numel() for p in trfm.parameters()))\n",
    "\n",
    "rnn = RNNSeq2Seq(len(vocab), 256, len(vocab), 3)\n",
    "rnn.load_state_dict(torch.load('.save/seq2seq_1.pkl'))\n",
    "rnn.eval()\n",
    "print('Total parameters:', sum(p.numel() for p in rnn.parameters()))\n",
    "\n",
    "bert = BERT(len(vocab), hidden=256, n_layers=8, attn_heads=8, dropout=0)\n",
    "bert.load_state_dict(torch.load('../result/chembl/ep00_it010000.pkl'))\n",
    "bert.eval()\n",
    "print('Total parameters:', sum(p.numel() for p in bert.parameters()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import deepchem as dc\n",
    "from deepchem.models.tensorgraph.models.graph_models import GraphConvModel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HIV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(41127, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>smiles</th>\n",
       "      <th>activity</th>\n",
       "      <th>HIV_active</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>O=S(=O)(O)CCS(=O)(=O)O</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              smiles activity  HIV_active\n",
       "0  CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...       CI           0\n",
       "1  C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...       CI           0\n",
       "2                   CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21       CI           0\n",
       "3    Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1       CI           0\n",
       "4                             O=S(=O)(O)CCS(=O)(=O)O       CI           0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the HIV dataset; later cells use its smiles and HIV_active columns.\n",
    "df = pd.read_csv('data/hiv.csv')\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKMAAAEWCAYAAAAKHCfyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAADtpJREFUeJzt3X+MHPV5x/H3BwrCwfwosXM1hsZpilAAJ464UtKoks8kyE3aQChBpfwwDSWJWmiq0goL9QcJQaIqKVIQlUoF2KDAFfEj0KQtRW4OSkshZzDYQCMSYhobY9eAgXMtmiNP/5g5vD7uzuNjZ+bZu89LWt3tzOzMM/DxzM3Ofp9VRGCWwX5tF2A2xmG0NBxGS8NhtDQcRkvDYbQ0HEZLw2GsmaSNkj4xbtoFkh7unC/pY5J2SjpkgnU8Ienipmpui8OYREQ8AmwCfrNzuqQTgOOA29uoq0kOYy6rgfPHTTsf+E5EvNxCPY1yGHO5FfhVST8PIGk/4LeBW1qtqiEOYzO+JWnH2AP4m4kWiogfAw8C55aTTgEOAr7TTJntchibcXpEHD72AH5vimU7T9XnAbdFxE9qrzABhzGfu4GFkgaAM5glp2hwGNOJiJ3AncDNwAsRMdxySY1xGHNaDbyfWXRUBJA/XGtZ+MhoaTiMlobDaGk4jJbGz7RdQBXz5s2LRYsWNba9nTt3cvDBBze2vaY1vX9r167dHhHz97ZcT4Rx0aJFDA8393bb0NAQS5cubWx7TWt6/yS9UGU5n6YtDYfR0nAYLQ2H0dJwGC0Nh9HScBgtDYfR0nAYLY2euAMzkUUr6xujdOniUS6oYf0br/5019c5k/jIaGnUFkZJB0l6TNKTkp6W9JVy+ipJP5K0rnwsqasG6y11nqbfBJZFxIikA4CHJf1TOe9PIuLOGrdtPai2MEYxuGakfHpA+fCAG5tUrQOyJO0PrAV+Ebg+Ii6TtAr4GMWRcw2wMiLenOC1XwC+ANDX13fi4ODgHvPXb36ttrr75sDWXd1f7+KFh3V/pdMwMjLC3LlzG9vewMDA2ojo39tyjYwOlHQ4cA9wCfAy8BJwIHAD8MOI+OpUr+/v74/xn2es+2r66+u7f9LIcjXdwucZK4WxkavpiNgBDAHLI2JLFN6kGKh+UhM1WH51Xk3PL4+ISJoDfAL4L0kLymkCTgc21FWD9ZY6r6YXAKvLvxv3A+6IiG9L+ldJ8wEB64Av1ViD9ZA6r6afAj46wfRldW3TepvvwFgaDqOl4TBaGg6jpeEwWhoOo6XhMFoaDqOl4TBaGg6jpeEwWhoOo6XhMFoaDqOl4TBaGg6jpeEwWhoOo6XhMFoaDqOl0Ubjpw9IelTSc5L+XtKBddVgvaXOI+NY46ePAEuA5ZJOBv4SuDYijgFeBS6ssQbrIbWFsewaMVHjp2XAWAey1RQD+c3q7Vw7vvET8ENgR0SMlotsAhZO8trOxk8MDQ3tMf/SxaMTvKo7+ubUs/7x+9CWkZGRNLV0qjWMEfEWsKSj8dOHJlpsktfeQNEYiv7+/hjfqKiONsdjamv8dM7Srq9zOrJ+UWfTjZ9OBg6XNPZ/+ijgxSZqsPyabvz0LPBd4MxysRXAvXXVYL2ljcZPzwCDkr4GPAHcWGMN1kPaaPz0PO7JaBPwHRhLw2G0NBxGS8NhtDQcRkvDYbQ0HEZLw2G0NBxGS8NhtDQcRkvDYbQ0HEZLw2G0NBxGS8NhtDQcRkvDYbQ0HEZLw2G0NOocqnq0pO9KerZs/PTlcvoVkjZLWlc+PlVXDdZb6hyqOgpcGhGPSzoEWCvpgXLetRFxTY3bth5U51DVLcCW8vc3JD3LJH11zAAUMWGrm+5uRFoEPAScAPwRcAHwOjBMcfR8dYLXdDZ+OnFwcHCP+es3v1ZbvX1zYOuu7q938cLDur/SaRgZGWHu3LmNbW9gYGBtRPTvbbnawyhpLvAg
cFVE3C2pD9hO0fDpSmBBRHx+qnX09/fH8PDwHtMW9WLjp6s/3fV1TkfTjZ8kVQpjrVfTkg4A7gK+GRF3A0TE1oh4KyJ+Cvwd7i5hpTqvpkXRR+fZiPjrjukLOhb7LLChrhqst9R5Nf1x4DxgvaR15bTLgbMlLaE4TW8EvlhjDdZD6ryafhjQBLP+sa5tWm/zHRhLw2G0NBxGS8NhtDQcRkvDYbQ0HEZLw2G0NBxGS8NhtDQcRkvDYbQ0HEZLw2G0NBxGS8NhtDQqhVHSlyUdqsKNkh6XdGrdxdnsUvXI+PmIeB04FZgP/A5wdW1V2axUNYxjwwc+BdwcEU8y8ZACs2mrGsa1kv6FIoz3l+1KflpfWTYbVQ3jhcBK4Jci4n+BAylO1ZOaovHTEZIekPRc+fNn39Ue2IxRNYwPRMTjEbEDICJeBq7dy2vGGj99CDgZ+H1Jx1GEek1EHAOsKZ+bTT1UVdJBwHuAeeURbOzvxEOBI6d67RSNn04DlpaLrQaGgMumV77NJHsbN/1F4A8pgreW3WF8Hbi+6kbKxk8fBR4F+sqgEhFbJL1vktd0Nn5iaGhoj/mXLh6tuvl91jennvWP34e2jIyMpKmlU6XGT5IuiYjrprWBdzZ+2hERh3fMfzUipvy70Y2fuitr46dK/8Uj4jpJvwIs6nxNRNyylyLe0fgJ2CppQXlUXABsq1KDzXyVwijpVuCDwDrgrXJyAJOGcbLGT8B9wAqKN81XAPfue9k2E1U9F/UDx8W+NXOcrPHT1cAdki4E/hv43D6s02awqmHcAPwc5dVxFVM0fgI4pep6bPaoGsZ5wDOSHgPeHJsYEZ+ppSqblaqG8Yo6izCD6lfTD9ZdiFnVq+k3KK6eobgvfQCwMyIOraswm32qHhkP6Xwu6XTcGN66bFrDDiLiW8CyLtdis1zV0/QZHU/3o3jfsf5vM7JZperV9G90/D5K8S0Fp3W9GpvVqv7NOOUHac26oerowKMk3SNpm6Stku6SdFTdxdnsUvUC5maKDzgcSfEB2X8op5l1TdUwzo+ImyNitHysohiyatY1VcO4XdK5kvYvH+cCL9dZmM0+lQfxA2cBL1F8cudM9jI60GxfVX1r50pgxdiXlEs6AriGIqRmXVH1yPjhsSACRMQrFAOszLqmahj36xxsXx4Z6/x6YJuFqgbq68B/SLqT4jbgWcBVtVVls1LVOzC3SBqm+HCEgDMi4plaK7NZp/Kptgxf5QBKugn4dWBbRJxQTrsCuAj4n3KxyyPCX4ZuQL2da1cByyeYfm1ELCkfDqK9rbYwRsRDwCt1rd9mnjZ6el8s6SlJN7kdnnWq1Gtn2isvGj59u+Nvxj5gO8UV+ZXAgoiY8I3zcY2fThwcHNxj/vrNr9VWd98c2Lqr++tdvPCw7q90GkZGRpg7d25j2xsYGKjUa6fRMFadN54bP3VX1sZPjZ6my0ZPYz5L0anCDKjxLoqk2ymags6TtAn4C2CppCUUp+mNFP0fzYAawxgRZ08w+ca6tme9z9+QZWk4jJaGw2hpOIyWhsNoaTiMlobDaGk4jJaGw2hpOIyWhsNoaTiMlobDaGk4jJaGw2hpOIyWhsNoaTiMlobDaGk4jJZGbWEsO0Zsk7ShY9oRkh6Q9Fz50x0l7G1NN35aCayJiGOANeVzM6D5xk+nAavL31cDp9e1fes9TbdC7ouILQARsUXS+yZbcFyvHYaGhvaYf+ni0fqKnFPP+sfvQ1tGRkbS1NIpbV/uiLgBuAGKXjvje8Nc0Iu9ds5Z2vV1TkfTvXaqavpqeutYv53y57aGt2+JNR3G+4AV5e8rgHsb3r4lVudbO7cDjwDHStok6ULgauCTkp4DPlk+NwOab/wEcEpd27TelvYCZraruxlqHReA77YZqm8HWhoOo6XhMFoaDqOl4TBaGg6jpeEwWhoOo6XhMFoaDqOl4TBaGg6jpeEwWhoOo6XhMFoaDqOl4TBaGg6jpeEwWhqtjIGRtBF4
A3gLGI2I/jbqsFzaHJA1EBHbW9y+JePTtKWhiGh+o9KPgFeBAP627KszfpnOxk8nDg4O7jF//ebXaquvbw5s3dX99S5eeFjlZWfS/g0MDKyt8qdYW2E8MiJeLLuQPQBcUrbQm1B/f38MDw/vMa3uccW1NH7ah3HFM2n/JFUKYyun6Yh4sfy5DbgHOKmNOiyXxsMo6WBJh4z9DpwKbJj6VTYbtHE13QfcI2ls+7dFxD+3UIcl03gYI+J54CNNb9fy81s7lobDaGk4jJaGw2hpOIyWhsNoaTiMlobDaGk4jJaGw2hpOIyWhsNoaTiMlobDaGk4jJaGw2hpOIyWhsNoaTiMlobDaGm0EkZJyyV9X9IPJK1sowbLp41x0/sD1wO/BhwHnC3puKbrsHzaODKeBPwgIp6PiP8DBoHTWqjDkmm8146kM4HlEfG75fPzgF+OiIvHLfd24yfgWOD7DZY5D5jJ7fqa3r/3R8T8vS3URkcJTTDtHf8iys5k7+hO1gRJwzO5gWnW/WvjNL0JOLrj+VHAiy3UYcm0EcbvAcdI+oCkA4HfAu5roQ5Lpo1eO6OSLgbuB/YHboqIp5uuYy9a+fOgQSn3r5VmoWYT8R0YS8NhtDQcxg4z/TalpJskbZOUslOww1iaJbcpVwHL2y5iMg7jbjP+NmX5jRKvtF3HZBzG3RYCP+54vqmcZg1xGHerdJvS6uMw7ubblC1zGHfzbcqWOYyliBgFxm5TPgvckfA25bsi6XbgEeBYSZskXdh2TZ18O9DS8JHR0nAYLQ2H0dJwGC0Nh9HScBgbIukKSX/cdh2ZOYyWhsNYE0nnS3pK0pOSbh037yJJ3yvn3SXpPeX0z0naUE5/qJx2vKTHJK0r13dMG/vTBL/pXQNJxwN3Ax+PiO2SjgD+ABiJiGskvTciXi6X/RqwNSKuk7SeosHBZkmHR8QOSdcB/xkR3yxvU+4fEbva2rc6+chYj2XAnRGxHSAixn+G8ARJ/1aG7xzg+HL6vwOrJF1EMXISitt3l0u6jKIzw4wMIjiMdRFTf/xsFXBxRCwGvgIcBBARXwL+lOLTQ+vKI+htwGeAXcD9kpbVWXibHMZ6rAHOkvRegPI03ekQYIukAyiOjJTLfTAiHo2IP6fohXO0pF8Ano+Ib1B8iujDjexBC9rotTPjRcTTkq4CHpT0FvAEsLFjkT8DHgVeANZThBPgr8oLFFEE+klgJXCupJ8ALwFfbWQnWuALGEvDp2lLw2G0NBxGS8NhtDQcRkvDYbQ0HEZL4/8BZqCis0Rrc78AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 144x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Molecules whose SMILES string is longer than 218 characters.\n",
    "df_large = df[np.array(list(map(len, df['smiles'])))>218]\n",
    "\n",
    "# Count both classes explicitly. If a class is missing from df_large it gets a zero-height bar;\n",
    "# a plain groupby count would return a single height, which plt.bar would broadcast to both labels.\n",
    "keys = ['0', '1']\n",
    "bottom = df_large['HIV_active'].value_counts().reindex([0, 1], fill_value=0).values\n",
    "plt.figure(figsize=(2,4))\n",
    "plt.bar(keys, bottom)\n",
    "plt.xlabel('class')\n",
    "plt.ylabel('counts')\n",
    "plt.title('HIV')\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAFkFJREFUeJzt3X2UZHV95/H3lxmU0UZQYTvs4GZIjssuZyY7kY5BSbQb1BBgxTUk4oJCZDMmBhddknXYnD26Z+OGXUNYYsyaWeVBRVoz4kNgo3DE1nAUdAZHhwcfWB11EGdEA9g4ixn87h/3dqhp+qG6uureLn7v1zl1uurWrfp95kLXp++tqt+NzESSVK6D2g4gSWqXRSBJhbMIJKlwFoEkFc4ikKTCWQSSVDiLQJIKZxFI84iIXRHxolnLzouIWzrvj4jnRcTDEXHoHM/xxYi4oKnMUi8sAmmZMvNzwG7gNzqXR8R64Djg2jZySd2yCKT+uBp49axlrwZuyMwftJBH6ppFIPXHe4FfjYh/BhARBwH/FnhPq6mkLlgE0sI+EhEPzFyAv5xrpcz8DvBp4Jx60cnAIcANzcSUemcRSAt7WWYePnMBXrfAup2Hh14FvD8z/2HgCaVlsgik/rkOWBsRE8DL8bCQhoRFIPVJZj4MbAWuBL6VmdtajiR1xSKQ+utq4Gdxb0BDJDwxjSSVzT0CSSqcRSBJhbMIJKlwFoEkFW512wG6ccQRR+S6devajrGghx9+mKc+9altx1jUsOSE4clqzv4yZ/9s3779/sw8crH1hqII1q1bx7ZtK/sj2VNTU4yPj7cdY1HDkhOGJ6s5+8uc/RMR3+pmPQ8NSVLhLAJJKpxFIEmFswgkqXAWgSQVziKQpMINrAgi4oqI2BsRd3Qse1tEfCUivhwRH46Iwwc1viSpO4PcI7gKOGXWspuA9Zn5C8DXgIsHOL4kqQsDK4LM/Azww1nLbszM/fXNW4GjBzW+JKk7Az0fQUSsA67PzPVz3Pc3wAcy833zPHYTsAlgdHT0+MnJyYHlnM/Oex/set3RNbBn34HLNqw9rM+Jlm96epqRkZG2Y3RlWLKas7/M2T8TExPbM3NssfVamWIiIv4I2A9cM986mbkF2AIwNjaWbXyV+7zNN3S97kUb9nPpzgM3566zx/ucaPmG4WvxM4Ylqzn7y5zNa7wIIuJc4HTg5PT0aJLUukaLICJOAd4EvDAzf9zk2JKkuQ3y46PXAp8Djo2I3RFxPvAXwKHATRGxIyLeOajxJUndGdgeQWa+co7F7x7UeJKk3vjNYkkqnEUgSYWzCCSpcBaBJBXOIpCkwlkEklQ4i0CSCmcRSFLhLAJJKpxFIEmFswgkqXAWgSQVziKQpMJZBJJUOItAkgpnEUhS4SwCSSqcRSBJhbMIJKlwFoEkFc4ikKTCWQSSVDiLQJIKZxFIUuEsAkkq3MCKICKuiIi9EXFHx7JnRMRNEfH1+ufTBzW+JKk7g9wjuAo4ZdayzcAnM/PZwCfr25KkFg2sCDLzM8APZy0+A7i6vn418LJBjS9J6k5k5uCePGIdcH1mrq9vP5CZh3fc//eZOefhoYjYBGwCGB0dPX5ycnJgOeez894Hu153dA3s2Xfgsg1rD+tzouWbnp5mZGSk7RhdGZas5uwvc/bPxMTE9swcW2y91U2E6UVmbgG2AIyNjeX4+HjjGc7bfEPX6160YT+X7jxwc+46e7zPiZZvamqKNrZlL4Ylqzn7y5zNa/pTQ3si4iiA+ufehseXJM3SdBF8DDi3vn4u8NGGx5ckzTLIj49eC3wOODYidkfE+cAlwIsj4uvAi+vbkqQWDew9gsx85Tx3nTyoMSVJS+c3iyWpcBaBJBXOIpCkwlkEklQ4i0CSCmcRSFLhLAJJKpxFIEmFswgkqXAWgSQVziKQpMJZBJJUOItAkgpnEUhS4SwCSSqcRSBJhbMI
JKlwFoEkFc4ikKTCWQSSVDiLQJIKZxFIUuEsAkkqnEUgSYWzCCSpcBaBJBWulSKIiDdGxJ0RcUdEXBsRh7SRQ5LUQhFExFrg3wNjmbkeWAWc1XQOSVKlrUNDq4E1EbEaeArw3ZZySFLxIjObHzTiQuCtwD7gxsw8e451NgGbAEZHR4+fnJzsaayd9z64jKTdG10De/YduGzD2sMaGXsppqenGRkZaTtGV4Ylqzn7y5z9MzExsT0zxxZbr/EiiIinAx8CXgE8APw1sDUz3zffY8bGxnLbtm09jbdu8w09PW6pLtqwn0t3rj5g2a5LTmtk7KWYmppifHy87RhdGZas5uwvc/ZPRHRVBG0cGnoR8M3M/H5m/gNwHfD8FnJIkminCL4NnBART4mIAE4G7m4hhySJFoogM28DtgK3AzvrDFuaziFJqqxefJX+y8w3A29uY2xJ0oH8ZrEkFc4ikKTCWQSSVDiLQJIKZxFIUuEsAkkqnEUgSYWzCCSpcBaBJBXOIpCkwnVVBBFxYUQ8LSrvjojbI+Ilgw4nSRq8bvcIXpOZDwEvAY4Efhu4ZGCpJEmN6bYIov55KnBlZn6pY5kkaYh1WwTbI+JGqiL4REQcCvx0cLEkSU3pdhrq84GNwDcy88cR8Uyqw0OSpCHX7R7BTZl5e2Y+AJCZPwAuG1wsSVJTFtwjiIhDgKcAR9QnnZ95X+BpwD8dcDZJUgMWOzT0WuANVC/623msCB4C3jHAXJKkhixYBJl5OXB5RLw+M9/eUCZJUoO6erM4M98eEc8H1nU+JjPfM6BckqSGdFUEEfFe4OeBHcCj9eIELAJJGnLdfnx0DDguM3OQYSRJzev246N3AD8zyCCSpHZ0u0dwBHBXRHweeGRmYWa+dCCpJEmN6bYI3jLIEJKk9nT7qaFP93PQiDgceBewnupN59dk5uf6OYYkqTvdfmroR1Qv2ABPAg4GHs7Mp/U47uXAxzPzzIh4EtW3lyVJLeh2j+DQztsR8TLgub0MGBFPA14AnFc/90+An/TyXJKk5YtePxEaEbdm5gk9PG4jsAW4C/hXVFNXXJiZD89abxOwCWB0dPT4ycnJnnLuvPfBnh63VKNrYM++RoZa1Ia1h8173/T0NCMjIw2m6d2wZDVnf5mzfyYmJrZn5thi63VVBBHx8o6bB1F9r+CFmfm8pQaLiDHgVuDEzLwtIi4HHsrM/zzfY8bGxnLbtm1LHQqAdZtv6OlxS3XRhv1curPb994Ha9clp81739TUFOPj482FWYZhyWrO/jJn/0REV0XQ7SvXv+64vh/YBZzRQy6A3cDuzLytvr0V2Nzjc0mSlqnb9wj6dhKazPxeRHwnIo7NzK8CJ1MdJpIktaCrbxZHxNER8eGI2BsReyLiQxFx9DLGfT1wTUR8merMZ/9tGc8lSVqGbg8NXQm8H/jN+vY59bIX9zJoZu6gep9BktSybucaOjIzr8zM/fXlKuDIAeaSJDWk2yK4PyLOiYhV9eUc4AeDDCZJaka3RfAa4LeA7wH3AWcCfXsDWZLUnm7fI/ivwLmZ+fcAEfEM4E+pCkKSNMS63SP4hZkSAMjMHwK/OJhIkqQmdVsEB0XE02du1HsEK+NrtJKkZen2xfxS4LMRsZVqFtLfAt46sFSSpMZ0+83i90TENuAkIICXZ6bfBpakJ4CuD+/UL/y++EvSE0y37xFIkp6gLAJJKpxFIEmFswgkqXAWgSQVziKQpMJZBJJUOItAkgpnEUhS4SwCSSqcRSBJhbMIJKlwFoEkFc4ikKTCWQSSVDiLQJIKZxFIUuFaK4KIWBURX4yI69vKIElqd4/gQuDuFseXJNFSEUTE0cBpwLvaGF+S9JjIzOYHjdgK/AlwKPAHmXn6HOtsAjYBjI6OHj85OdnTWDvvfXAZSbs3ugb27GtkqEVtWHvYvPdNT08zMjLSYJreDUtWc/aXOftnYmJie2aOLbbe6ibCdIqI04G9mbk9IsbnWy8ztwBbAMbG
xnJ8fN5VF3Te5ht6etxSXbRhP5fubHxzzmnX2ePz3jc1NUWv27Jpw5LVnP1lzua1cWjoROClEbELmAROioj3tZBDkkQLRZCZF2fm0Zm5DjgLuDkzz2k6hySp4vcIJKlwrR7UzswpYKrNDJJUOvcIJKlwFoEkFc4ikKTCWQSSVDiLQJIKZxFIUuEsAkkqnEUgSYWzCCSpcBaBJBXOIpCkwlkEklQ4i0CSCmcRSFLhLAJJKpxFIEmFswgkqXAWgSQVziKQpMJZBJJUOItAkgpnEUhS4SwCSSqcRSBJhbMIJKlwjRdBRDwrIj4VEXdHxJ0RcWHTGSRJj1ndwpj7gYsy8/aIOBTYHhE3ZeZdLWSRpOI1vkeQmfdl5u319R8BdwNrm84hSapEZrY3eMQ64DPA+sx8aNZ9m4BNAKOjo8dPTk72NMbOex9cXsguja6BPfsaGWpRG9YeNu9909PTjIyMDGTcfm/rprbpQturG4PcpgtZ6vbu3J7L/TcPUlvbc6mGIefExMT2zBxbbL3WiiAiRoBPA2/NzOsWWndsbCy3bdvW0zjrNt/Q0+OW6qIN+7l0ZxtH2h5v1yWnzXvf1NQU4+PjAxm339u6qW260PbqxiC36UKWur07t+dy/82D1Nb2XKphyBkRXRVBK58aioiDgQ8B1yxWApKkwWrjU0MBvBu4OzP/rOnxJUkHamOP4ETgVcBJEbGjvpzaQg5JEi18fDQzbwGi6XElSXPzm8WSVDiLQJIKZxFIUuEsAkkqnEUgSYWzCCSpcBaBJBXOIpCkwlkEklQ4i0CSCmcRSFLhLAJJKtzKOJOK+mqhE5ZctGE/5zV0sp5hsdwT6ixnm67kE8SoP5b7/1cT/4+4RyBJhbMIJKlwFoEkFc4ikKTCWQSSVDiLQJIKZxFIUuEsAkkqnEUgSYWzCCSpcBaBJBXOIpCkwlkEklS4VoogIk6JiK9GxD0RsbmNDJKkSuNFEBGrgHcAvw4cB7wyIo5rOockqdLGHsFzgXsy8xuZ+RNgEjijhRySJCAys9kBI84ETsnMf1fffhXwy5l5waz1NgGb6pvHAl9tNOjSHQHc33aILgxLThierObsL3P2z89m5pGLrdTGGcpijmWPa6PM3AJsGXyc/oiIbZk51naOxQxLThierObsL3M2r41DQ7uBZ3XcPhr4bgs5JEm0UwRfAJ4dEcdExJOAs4CPtZBDkkQLh4Yyc39EXAB8AlgFXJGZdzadYwCG5TDWsOSE4clqzv4yZ8Maf7NYkrSy+M1iSSqcRSBJhbMIuhQRz4qIT0XE3RFxZ0RcWC9/S0TcGxE76supHY+5uJ5G46sR8WsN5TwkIj4fEV+qc/6XevkxEXFbRHw9Ij5Qv1FPRDy5vn1Pff+6lnNeFRHf7NieG+vlERF/Xuf8ckQ8p4mcHXlXRcQXI+L6+vaK2p4L5Fxx2zMidkXEzjrPtnrZMyLipnp73hQRT2875wJZV9TvfF9kppcuLsBRwHPq64cCX6OaIuMtwB/Msf5xwJeAJwPHAP8XWNVAzgBG6usHA7cBJwAfBM6ql78T+L36+uuAd9bXzwI+0ND2nC/nVcCZc6x/KvC39eNOAG5r+L//fwDeD1xf315R23OBnCtuewK7gCNmLfsfwOb6+mbgv7edc4GsK+p3vh8X9wi6lJn3Zebt9fUfAXcDaxd4yBnAZGY+kpnfBO6hml5j0DkzM6frmwfXlwROArbWy68GXtaR8+r6+lbg5IiY60t/TeWczxnAe+rH3QocHhFHDTonQEQcDZwGvKu+Hayw7TlXzkW0tj0XyDOz3WZvz5WUcyGt/M73g0XQg3p3/xep/ooFuKDebb1iZpeWqiS+0/Gw3SxcHP3MtyoidgB7gZuo/jJ5IDP3z5HlH3PW9z8IPLONnJk5sz3fWm/PyyLiybNz1hrbnsD/BP4j8NP69jNZgdtzjpwzVtr2TODGiNge1VQyAKOZeR9Uf3QB/2QF5IS5s8IK+51fLotgiSJi
BPgQ8IbMfAj4X8DPAxuB+4BLZ1ad4+GNfFY3Mx/NzI1U39p+LvAvF8iyYnJGxHrgYuBfAL8EPAN4U5s5I+J0YG9mbu9cvECWlZQTVtj2rJ2Ymc+hmoH49yPiBQus22ZOmDvrivudXy6LYAki4mCqErgmM68DyMw99QvaT4H/zWO7gq1PpZGZDwBTVMdWD4+ImS8Qdmb5x5z1/YcBP2wp5yn1IbjMzEeAK2l/e54IvDQidlHNlHsS1V/eK217Pi5nRLxvBW5PMvO79c+9wIfrTHtmDvnUP/e2nXO+rCv5d75XFkGX6uO87wbuzsw/61jeebzy3wB31Nc/BpxVf4rkGODZwOcbyHlkRBxeX18DvIjq/YxPAWfWq50LfLQj57n19TOBm7N+56uFnF/peDEIquPEndvz1fWnSE4AHpw5lDBImXlxZh6dmeuo3vy9OTPPZoVtz3lynrPStmdEPDUiDp25DrykztS53WZvz8ZzLpR1pf3O90Mbs48OqxOBVwE76+PaAP+J6sQ6G6l2AXcBrwXIzDsj4oPAXcB+4Pcz89EGch4FXB3VCYAOAj6YmddHxF3AZET8MfBFqlKj/vneiLiH6i/XsxrIuFDOmyPiSKrd7B3A79br/x+qT5DcA/wY+O2Gcs7nTays7Tmfa1bY9hwFPly/f74aeH9mfjwivgB8MCLOB74N/GbLORfK+t4V9ju/bE4xIUmF89CQJBXOIpCkwlkEklQ4i0CSCmcRSFLhLAINpYj4o6hmLf1yPQPkL9fLpyLi253z+0TERyJiur6+LiLuqK+PRz1L56znnopq9siZ2SW31suPre/bEdUstI87Q1Xn8/f53zseEc/vuH1VRJy50GOkbvk9Ag2diHgecDrVbLCPRMQRwJM6VnmA6nsft9RfWutlkrKzM3PbrGV/DlyWmR+tc2zo4Xl7NQ5MA59tcEwVwj0CDaOjgPvraRPIzPtnpgKoTfLYF7leDlzXx3F3z9zIzJ0LrRzVpHpvi4gv1Hsur62Xj9d7Flsj4isRcc3MHkxEnFovuyWqefivj2qSw98F3ljvjfxqPcQLIuKzEfEN9w60HBaBhtGNwLMi4msR8ZcR8cJZ93+S6kVyFfU5AXoY45qOQ0Nvq5ddBtwcEX8bEW+cmSJjAedTTYnwS1STvv1OPfUAVLPXvoFqDvufA06MiEOAvwJ+PTN/BTgSIDN3UZ3z4LLM3JiZf1c/x1HAr1DtHV3Sw79RAiwCDaH6PAbHA5uA7wMfiIjzOlZ5FLgFeAWwpn4hXaqz6xfdjZn5h/W4V1LN5PrXVIdqbo3HpnWey0uo5snZQTVl+TOp5p8B+Hxm7q4nLtsBrKOaJfQb9Vz2ANcukvEjmfnTzLyLajoEqScWgYZSPfvjVGa+GbgA+I1Zq0wCb6c6k1g/x/1uZl6RmWdQzSezfoHVA3h9R6Eck5k31vc90rHeo1Tv1y31BDadz9HIyW/0xGQRaOjUn955dseijcC3Zq32d8CfsPhf1UsZ95SopiInIn6G6i/8exd4yCeA3+t4zD+vZ7Gcz1eAn4vHznP8io77fkR1ilSp7/zUkIbRCPD2+hj9fqqZKTvPHkU99fOfdvFcJ0fE7o7bM7NeXhMR++rr92fmi6gO9VweEf+vXv6Hmfm9BZ77XVSHfG6v3wz+Po+dgvFxMnNfRLwO+HhE3M+BUxj/DbA1Is4AXt/Fv0vqmrOPSitIRIxk5nRdHO8Avp6Zl7WdS09sHhqSVpbfqd9cvpPq7GZ/1XIeFcA9AkkqnHsEklQ4i0CSCmcRSFLhLAJJKpxFIEmF+//19BJmzSR0WgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Distribution of SMILES string lengths among the long molecules in df_large.\n",
    "plt.hist([len(sm) for sm in df_large['smiles'].values], bins=20)\n",
    "plt.xlabel('SMILES length')\n",
    "plt.ylabel('counts')\n",
    "plt.title('HIV')\n",
    "plt.grid()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split by SMILES string length: at most 218 characters goes to df_train, anything longer to df_test.\n",
    "df_train = df[df['smiles'].map(len) <= 218]\n",
    "df_test = df[df['smiles'].map(len) > 218]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ablation_hiv(X, X_test, y, y_test, rate, n_repeats, seed=None):\n",
    "    '''Train an MLP on a fraction of the training data several times and summarise test ROC-AUC.\n",
    "\n",
    "    Args:\n",
    "        X, y: training features and labels.\n",
    "        X_test, y_test: held-out features and labels used for scoring.\n",
    "        rate: fraction of (X, y) to train on. 1 uses everything; any other value\n",
    "            draws a stratified subsample of that size.\n",
    "        n_repeats: number of independent train/score rounds.\n",
    "        seed: optional integer. When given, repeat i uses seed + i for both the\n",
    "            subsample and the classifier, so the result is reproducible.\n",
    "            The default None keeps the original unseeded behaviour.\n",
    "\n",
    "    Returns:\n",
    "        dict with keys 'auc mean' and 'auc std' computed over the repeats.\n",
    "    '''\n",
    "    auc = np.empty(n_repeats)\n",
    "    for i in range(n_repeats):\n",
    "        # A different but deterministic seed per repeat keeps the repeats independent.\n",
    "        random_state = None if seed is None else seed + i\n",
    "        clf = MLPClassifier(max_iter=1000, random_state=random_state)\n",
    "        if rate==1:\n",
    "            X_train, y_train = X,y\n",
    "        else:\n",
    "            X_train, _, y_train, __ = train_test_split(\n",
    "                X, y, test_size=1-rate, stratify=y, random_state=random_state)\n",
    "        clf.fit(X_train, y_train)\n",
    "        y_score = clf.predict_proba(X_test)\n",
    "        # Column 1 is the score of the second class in clf.classes_.\n",
    "        auc[i] = roc_auc_score(y_test, y_score[:,1])\n",
    "    return {'auc mean': np.mean(auc), 'auc std': np.std(auc)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ablation_hiv_dc(dataset, test_data, rate, n_repeats):\n",
    "    '''Repeatedly fit a GraphConvModel on a stratified fraction of dataset and summarise ROC-AUC on test_data.\n",
    "\n",
    "    Returns a dict holding the mean ('auc mean') and standard deviation ('auc std')\n",
    "    of the per-repeat scores.\n",
    "    '''\n",
    "    auc = np.empty(n_repeats)\n",
    "    for run in range(n_repeats):\n",
    "        model = GraphConvModel(n_tasks=1, batch_size=64, mode='classification')\n",
    "        stratified = dc.splits.RandomStratifiedSplitter()\n",
    "        # Only the training part is used; the remaining 1-rate share is discarded.\n",
    "        train_data, _unused_valid, _unused_test = stratified.train_valid_test_split(\n",
    "            dataset, frac_train=rate, frac_valid=1-rate, frac_test=0)\n",
    "        model.fit(train_data)\n",
    "        roc_auc = [dc.metrics.Metric(dc.metrics.roc_auc_score)]\n",
    "        auc[run] = model.evaluate(test_data, roc_auc)['roc_auc_score']\n",
    "    return {'auc mean': np.mean(auc), 'auc std': np.std(auc)}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ST"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 41082 molecules. It will take a little time.\n",
      "(41082, 1024)\n",
      "SMILES is too long (220)\n",
      "SMILES is too long (274)\n",
      "SMILES is too long (247)\n",
      "SMILES is too long (226)\n",
      "SMILES is too long (244)\n",
      "SMILES is too long (243)\n",
      "SMILES is too long (253)\n",
      "SMILES is too long (266)\n",
      "SMILES is too long (346)\n",
      "SMILES is too long (232)\n",
      "SMILES is too long (242)\n",
      "SMILES is too long (247)\n",
      "SMILES is too long (240)\n",
      "SMILES is too long (370)\n",
      "SMILES is too long (224)\n",
      "SMILES is too long (283)\n",
      "SMILES is too long (265)\n",
      "SMILES is too long (240)\n",
      "SMILES is too long (219)\n",
      "SMILES is too long (246)\n",
      "SMILES is too long (243)\n",
      "SMILES is too long (284)\n",
      "SMILES is too long (270)\n",
      "SMILES is too long (232)\n",
      "SMILES is too long (260)\n",
      "SMILES is too long (284)\n",
      "SMILES is too long (284)\n",
      "SMILES is too long (439)\n",
      "SMILES is too long (491)\n",
      "SMILES is too long (439)\n",
      "SMILES is too long (296)\n",
      "SMILES is too long (341)\n",
      "SMILES is too long (285)\n",
      "SMILES is too long (327)\n",
      "SMILES is too long (341)\n",
      "SMILES is too long (400)\n",
      "SMILES is too long (263)\n",
      "SMILES is too long (238)\n",
      "SMILES is too long (383)\n",
      "SMILES is too long (360)\n",
      "SMILES is too long (233)\n",
      "SMILES is too long (365)\n",
      "SMILES is too long (265)\n",
      "SMILES is too long (240)\n",
      "SMILES is too long (223)\n",
      "(45, 1024)\n"
     ]
    }
   ],
   "source": [
    "# Training features: tokenise each SMILES, map the tokens to padded id rows and encode them with trfm.\n",
    "# NOTE(review): split() comes from utils; presumably it returns space-separated tokens,\n",
    "# since get_inputs calls .split() on each item -- confirm.\n",
    "x_split = [split(sm) for sm in df_train['smiles'].values]\n",
    "xid, _ = get_array(x_split)\n",
    "# The id tensor is transposed before encoding; presumably encode expects (seq_len, batch) -- TODO confirm.\n",
    "X = trfm.encode(torch.t(xid))\n",
    "print(X.shape)\n",
    "# Same pipeline for the held-out molecules in df_test.\n",
    "x_split = [split(sm) for sm in df_test['smiles'].values]\n",
    "xid, _ = get_array(x_split)\n",
    "X_test = trfm.encode(torch.t(xid))\n",
    "print(X_test.shape)\n",
    "# Labels taken from the same frames, in the same row order as X and X_test.\n",
    "y, y_test = df_train['HIV_active'].values, df_test['HIV_active'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0125 {'auc mean': 0.5862654320987655, 'auc std': 0.14652686355106523}\n",
      "0.025 {'auc mean': 0.5631172839506172, 'auc std': 0.11280209739337771}\n",
      "0.05 {'auc mean': 0.6125, 'auc std': 0.14407490386105684}\n",
      "0.1 {'auc mean': 0.5421296296296296, 'auc std': 0.13119017677727712}\n",
      "0.2 {'auc mean': 0.5924382716049382, 'auc std': 0.15169203612117552}\n",
      "0.4 {'auc mean': 0.6316358024691358, 'auc std': 0.11952566463707663}\n",
      "0.8 {'auc mean': 0.7474537037037037, 'auc std': 0.08832672645487213}\n",
      "0.6107914462081129\n"
     ]
    }
   ],
   "source": [
    "# Sweep every training fraction in rates (20 repeats each), print the per-rate summary,\n",
    "# then print the mean AUC averaged over all rates.\n",
    "scores = []\n",
    "for rate in rates:\n",
    "    score_dic = ablation_hiv(X, X_test, y, y_test, rate, 20)\n",
    "    print(rate, score_dic)\n",
    "    scores.append(score_dic['auc mean'])\n",
    "print(np.mean(scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'auc mean': 0.7293981481481482, 'auc std': 0.05145690962211546}\n"
     ]
    }
   ],
   "source": [
    "# Reference point: train on the whole training set (rate=1), 20 repeats.\n",
    "score_dic = ablation_hiv(X, X_test, y, y_test, 1, 20)\n",
    "print(score_dic)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ECFP4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "41082 41082\n",
      "45 45\n"
     ]
    }
   ],
   "source": [
    "# Overwrite X, y, X_test and y_test with Morgan-fingerprint features. The labels are taken from\n",
    "# extract_morgan's return value so they stay aligned if any SMILES is dropped as unparsable.\n",
    "x,X,y = extract_morgan(df_train['smiles'].values, df_train['HIV_active'].values)\n",
    "print(len(X), len(y))\n",
    "x,X_test,y_test = extract_morgan(df_test['smiles'].values, df_test['HIV_active'].values)\n",
    "print(len(X_test), len(y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0125 {'auc mean': 0.5419753086419754, 'auc std': 0.04503846854450115}\n",
      "0.025 {'auc mean': 0.5503086419753087, 'auc std': 0.0564072236675212}\n",
      "0.05 {'auc mean': 0.5694444444444444, 'auc std': 0.08849201955412457}\n",
      "0.1 {'auc mean': 0.5827160493827162, 'auc std': 0.11926366834883169}\n",
      "0.2 {'auc mean': 0.6459876543209876, 'auc std': 0.13795329598523573}\n",
      "0.4 {'auc mean': 0.755246913580247, 'auc std': 0.08677780763643579}\n",
      "0.8 {'auc mean': 0.8388888888888889, 'auc std': 0.06304612034579853}\n",
      "0.640652557319224\n"
     ]
    }
   ],
   "source": [
    "# Sweep every training fraction in rates (20 repeats each), print the per-rate summary,\n",
    "# then print the mean AUC averaged over all rates.\n",
    "scores = []\n",
    "for rate in rates:\n",
    "    score_dic = ablation_hiv(X, X_test, y, y_test, rate, 20)\n",
    "    print(rate, score_dic)\n",
    "    scores.append(score_dic['auc mean'])\n",
    "print(np.mean(scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'auc mean': 0.8560185185185183, 'auc std': 0.03140384702527043}\n"
     ]
    }
   ],
   "source": [
    "# Reference point: train on the whole training set (rate=1), 20 repeats.\n",
    "score_dic = ablation_hiv(X, X_test, y, y_test, 1, 20)\n",
    "print(score_dic)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### RNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 41082 molecules. It will take a little time.\n",
      "(41082, 1024)\n",
      "SMILES is too long (220)\n",
      "SMILES is too long (274)\n",
      "SMILES is too long (247)\n",
      "SMILES is too long (226)\n",
      "SMILES is too long (244)\n",
      "SMILES is too long (243)\n",
      "SMILES is too long (253)\n",
      "SMILES is too long (266)\n",
      "SMILES is too long (346)\n",
      "SMILES is too long (232)\n",
      "SMILES is too long (242)\n",
      "SMILES is too long (247)\n",
      "SMILES is too long (240)\n",
      "SMILES is too long (370)\n",
      "SMILES is too long (224)\n",
      "SMILES is too long (283)\n",
      "SMILES is too long (265)\n",
      "SMILES is too long (240)\n",
      "SMILES is too long (219)\n",
      "SMILES is too long (246)\n",
      "SMILES is too long (243)\n",
      "SMILES is too long (284)\n",
      "SMILES is too long (270)\n",
      "SMILES is too long (232)\n",
      "SMILES is too long (260)\n",
      "SMILES is too long (284)\n",
      "SMILES is too long (284)\n",
      "SMILES is too long (439)\n",
      "SMILES is too long (491)\n",
      "SMILES is too long (439)\n",
      "SMILES is too long (296)\n",
      "SMILES is too long (341)\n",
      "SMILES is too long (285)\n",
      "SMILES is too long (327)\n",
      "SMILES is too long (341)\n",
      "SMILES is too long (400)\n",
      "SMILES is too long (263)\n",
      "SMILES is too long (238)\n",
      "SMILES is too long (383)\n",
      "SMILES is too long (360)\n",
      "SMILES is too long (233)\n",
      "SMILES is too long (365)\n",
      "SMILES is too long (265)\n",
      "SMILES is too long (240)\n",
      "SMILES is too long (223)\n",
      "(45, 1024)\n"
     ]
    }
   ],
   "source": [
    "# Training features: tokenise each SMILES, map the tokens to padded id rows and encode them with rnn.\n",
    "# NOTE(review): split() comes from utils; presumably it returns space-separated tokens,\n",
    "# since get_inputs calls .split() on each item -- confirm.\n",
    "x_split = [split(sm) for sm in df_train['smiles'].values]\n",
    "xid, _ = get_array(x_split)\n",
    "# The id tensor is transposed before encoding; presumably encode expects (seq_len, batch) -- TODO confirm.\n",
    "X = rnn.encode(torch.t(xid))\n",
    "print(X.shape)\n",
    "# Same pipeline for the held-out molecules in df_test.\n",
    "x_split = [split(sm) for sm in df_test['smiles'].values]\n",
    "xid, _ = get_array(x_split)\n",
    "X_test = rnn.encode(torch.t(xid))\n",
    "print(X_test.shape)\n",
    "# Labels taken from the same frames, in the same row order as X and X_test.\n",
    "y, y_test = df_train['HIV_active'].values, df_test['HIV_active'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0125 {'auc mean': 0.6413580246913579, 'auc std': 0.11069279967556718}\n",
      "0.025 {'auc mean': 0.6655864197530865, 'auc std': 0.056627812535610146}\n",
      "0.05 {'auc mean': 0.644753086419753, 'auc std': 0.07297865449282823}\n",
      "0.1 {'auc mean': 0.6932098765432098, 'auc std': 0.06506711622922942}\n",
      "0.2 {'auc mean': 0.7206018518518518, 'auc std': 0.08554099486619125}\n",
      "0.4 {'auc mean': 0.6810185185185185, 'auc std': 0.06736796063488669}\n",
      "0.8 {'auc mean': 0.7066358024691357, 'auc std': 0.05058786972816301}\n",
      "0.6790233686067019\n"
     ]
    }
   ],
   "source": [
    "# Sweep every value in `rates` and keep the mean AUC reported for each one.\n",
    "# NOTE(review): the trailing 20 is presumably the number of repeats per rate;\n",
    "# confirm against ablation_hiv.\n",
    "scores = []\n",
    "for rate in rates:\n",
    "    score_dic = ablation_hiv(X, X_test, y, y_test, rate, 20)\n",
    "    print(rate, score_dic)\n",
    "    scores += [score_dic['auc mean']]\n",
    "\n",
    "# Average of the per-rate mean AUCs.\n",
    "print(np.array(scores).mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'auc mean': 0.6912037037037038, 'auc std': 0.09106494426547503}\n"
     ]
    }
   ],
   "source": [
    "# Single run with rate = 1, using the same trailing argument (20) as the sweep.\n",
    "score_dic = ablation_hiv(\n",
    "    X, X_test, y, y_test,\n",
    "    1, 20\n",
    ")\n",
    "print(score_dic)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GC (graph convolution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading raw samples now.\n",
      "shard_size: 8192\n",
      "About to start loading CSV from data/hiv.csv\n",
      "Loading shard 1 of size 8192.\n",
      "Featurizing sample 0\n",
      "Featurizing sample 1000\n",
      "Featurizing sample 2000\n",
      "Featurizing sample 3000\n",
      "Featurizing sample 4000\n",
      "Featurizing sample 5000\n",
      "Featurizing sample 6000\n",
      "Featurizing sample 7000\n",
      "Featurizing sample 8000\n",
      "TIMING: featurizing shard 0 took 34.156 s\n",
      "Loading shard 2 of size 8192.\n",
      "Featurizing sample 0\n",
      "Featurizing sample 1000\n",
      "Featurizing sample 2000\n",
      "Featurizing sample 3000\n",
      "Featurizing sample 4000\n",
      "Featurizing sample 5000\n",
      "Featurizing sample 6000\n",
      "Featurizing sample 7000\n",
      "Featurizing sample 8000\n",
      "TIMING: featurizing shard 1 took 35.162 s\n",
      "Loading shard 3 of size 8192.\n",
      "Featurizing sample 0\n",
      "Featurizing sample 1000\n",
      "Featurizing sample 2000\n",
      "Featurizing sample 3000\n",
      "Featurizing sample 4000\n",
      "Featurizing sample 5000\n",
      "Featurizing sample 6000\n",
      "Featurizing sample 7000\n",
      "Featurizing sample 8000\n",
      "TIMING: featurizing shard 2 took 36.376 s\n",
      "Loading shard 4 of size 8192.\n",
      "Featurizing sample 0\n",
      "Featurizing sample 1000\n",
      "Featurizing sample 2000\n",
      "Featurizing sample 3000\n",
      "Featurizing sample 4000\n",
      "Featurizing sample 5000\n",
      "Featurizing sample 6000\n",
      "Featurizing sample 7000\n",
      "Featurizing sample 8000\n",
      "TIMING: featurizing shard 3 took 35.868 s\n",
      "Loading shard 5 of size 8192.\n",
      "Featurizing sample 0\n",
      "Featurizing sample 1000\n",
      "Featurizing sample 2000\n",
      "Featurizing sample 3000\n",
      "Featurizing sample 4000\n",
      "Featurizing sample 5000\n",
      "Featurizing sample 6000\n",
      "Featurizing sample 7000\n",
      "Featurizing sample 8000\n",
      "TIMING: featurizing shard 4 took 36.968 s\n",
      "Loading shard 6 of size 8192.\n",
      "Featurizing sample 0\n",
      "TIMING: featurizing shard 5 took 0.968 s\n",
      "TIMING: dataset construction took 220.752 s\n",
      "Loading dataset from disk.\n"
     ]
    }
   ],
   "source": [
    "# Featurize data/hiv.csv with ConvMolFeaturizer: SMILES are read from the\n",
    "# 'smiles' column and 'HIV_active' is the single task.\n",
    "featurizer = dc.feat.ConvMolFeaturizer()\n",
    "loader = dc.data.CSVLoader(tasks=['HIV_active'], smiles_field='smiles', featurizer=featurizer)\n",
    "dataset = loader.featurize('data/hiv.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TIMING: dataset construction took 62.290 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 20.102 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 0.422 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.842 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.622 s\n",
      "Loading dataset from disk.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py:98: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computed_metrics: [0.7407407407407407]\n",
      "TIMING: dataset construction took 0.252 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.058 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.823 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.617283950617284]\n",
      "TIMING: dataset construction took 0.437 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.790 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.036 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.45987654320987653]\n",
      "TIMING: dataset construction took 0.602 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.796 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.979 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6604938271604939]\n",
      "TIMING: dataset construction took 0.290 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.907 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.147 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.46604938271604934]\n",
      "TIMING: dataset construction took 0.532 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.164 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.800 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5030864197530864]\n",
      "TIMING: dataset construction took 0.337 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.288 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.886 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5154320987654322]\n",
      "TIMING: dataset construction took 0.306 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.950 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.854 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6265432098765431]\n",
      "TIMING: dataset construction took 0.533 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.520 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.534 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.3055555555555556]\n",
      "TIMING: dataset construction took 0.601 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.703 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.371 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5555555555555556]\n",
      "TIMING: dataset construction took 0.441 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.295 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.891 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6018518518518519]\n",
      "TIMING: dataset construction took 0.445 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.260 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.869 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6111111111111112]\n",
      "TIMING: dataset construction took 0.539 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.187 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.024 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.712962962962963]\n",
      "TIMING: dataset construction took 0.536 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.699 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.579 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6296296296296295]\n",
      "TIMING: dataset construction took 0.392 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.745 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.518 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7469135802469136]\n",
      "TIMING: dataset construction took 0.384 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.727 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.747 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.49074074074074076]\n",
      "TIMING: dataset construction took 0.597 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.467 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 43.646 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5740740740740742]\n",
      "TIMING: dataset construction took 0.495 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.901 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.727 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6635802469135803]\n",
      "TIMING: dataset construction took 0.456 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.953 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.202 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6419753086419753]\n",
      "TIMING: dataset construction took 0.538 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.650 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.512 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6820987654320987]\n",
      "0.0125 {'auc std': 0.10627271985136169, 'auc mean': 0.5902777777777778}\n",
      "TIMING: dataset construction took 0.817 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.123 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.492 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.537037037037037]\n",
      "TIMING: dataset construction took 0.931 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.380 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.629 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7592592592592593]\n",
      "TIMING: dataset construction took 0.862 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.497 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.132 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6419753086419753]\n",
      "TIMING: dataset construction took 0.993 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.953 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.951 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5895061728395061]\n",
      "TIMING: dataset construction took 0.971 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.091 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.817 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5864197530864197]\n",
      "TIMING: dataset construction took 0.741 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.882 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.331 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6265432098765431]\n",
      "TIMING: dataset construction took 1.016 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.630 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.317 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.34567901234567905]\n",
      "TIMING: dataset construction took 0.745 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.707 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.733 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7469135802469136]\n",
      "TIMING: dataset construction took 0.907 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.187 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.906 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6820987654320988]\n",
      "TIMING: dataset construction took 0.941 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.474 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.327 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6388888888888888]\n",
      "TIMING: dataset construction took 0.785 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.318 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.092 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5648148148148148]\n",
      "TIMING: dataset construction took 0.945 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.557 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.966 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5]\n",
      "TIMING: dataset construction took 0.837 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.593 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.240 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.75]\n",
      "TIMING: dataset construction took 0.773 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.405 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.367 s\n",
      "Loading dataset from disk.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computed_metrics: [0.6450617283950617]\n",
      "TIMING: dataset construction took 0.845 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.368 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.981 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6604938271604939]\n",
      "TIMING: dataset construction took 0.731 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.865 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.573 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5370370370370371]\n",
      "TIMING: dataset construction took 1.214 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.154 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.725 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8024691358024691]\n",
      "TIMING: dataset construction took 1.098 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 42.479 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.651 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.4012345679012346]\n",
      "TIMING: dataset construction took 0.906 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.692 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.998 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6975308641975309]\n",
      "TIMING: dataset construction took 1.148 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.566 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.291 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6481481481481481]\n",
      "0.025 {'auc std': 0.11309475463246396, 'auc mean': 0.6180555555555556}\n",
      "TIMING: dataset construction took 2.152 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.281 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.958 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7314814814814814]\n",
      "TIMING: dataset construction took 1.966 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.926 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.322 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6141975308641976]\n",
      "TIMING: dataset construction took 2.282 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.290 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.231 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6388888888888888]\n",
      "TIMING: dataset construction took 2.071 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.090 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.790 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6265432098765433]\n",
      "TIMING: dataset construction took 1.879 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.623 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.166 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6018518518518519]\n",
      "TIMING: dataset construction took 1.810 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.015 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.162 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.4598765432098766]\n",
      "TIMING: dataset construction took 1.716 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.215 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.408 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6450617283950617]\n",
      "TIMING: dataset construction took 2.682 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.464 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.899 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7685185185185186]\n",
      "TIMING: dataset construction took 1.749 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.289 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.121 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5709876543209876]\n",
      "TIMING: dataset construction took 1.605 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.645 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.189 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7932098765432098]\n",
      "TIMING: dataset construction took 1.889 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.204 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.883 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.49382716049382713]\n",
      "TIMING: dataset construction took 2.072 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.882 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.758 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8734567901234568]\n",
      "TIMING: dataset construction took 2.077 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.074 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.767 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.4320987654320988]\n",
      "TIMING: dataset construction took 2.178 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.869 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.601 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5895061728395061]\n",
      "TIMING: dataset construction took 1.900 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.567 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.280 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5925925925925926]\n",
      "TIMING: dataset construction took 2.052 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.053 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.718 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6790123456790124]\n",
      "TIMING: dataset construction took 2.180 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.348 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.304 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7253086419753086]\n",
      "TIMING: dataset construction took 1.925 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.093 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.836 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5030864197530864]\n",
      "TIMING: dataset construction took 2.500 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.667 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.996 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7438271604938271]\n",
      "TIMING: dataset construction took 1.910 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.209 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 41.725 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5802469135802468]\n",
      "0.05 {'auc std': 0.11241635190615207, 'auc mean': 0.6331790123456791}\n",
      "TIMING: dataset construction took 4.121 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.856 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.729 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.787037037037037]\n",
      "TIMING: dataset construction took 3.913 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.810 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.844 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.4382716049382716]\n",
      "TIMING: dataset construction took 3.896 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.099 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.812 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6388888888888888]\n",
      "TIMING: dataset construction took 3.976 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.273 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.318 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5679012345679013]\n",
      "TIMING: dataset construction took 4.032 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.878 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.855 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6481481481481481]\n",
      "TIMING: dataset construction took 3.512 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.666 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.300 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6759259259259258]\n",
      "TIMING: dataset construction took 4.287 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.514 s\n",
      "Loading dataset from disk.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TIMING: dataset construction took 36.613 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5401234567901234]\n",
      "TIMING: dataset construction took 4.005 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.509 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.128 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6820987654320987]\n",
      "TIMING: dataset construction took 3.768 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.242 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.010 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.521604938271605]\n",
      "TIMING: dataset construction took 3.799 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.627 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.220 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7407407407407407]\n",
      "TIMING: dataset construction took 4.253 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.222 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.973 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6697530864197531]\n",
      "TIMING: dataset construction took 4.430 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.027 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.736 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5555555555555556]\n",
      "TIMING: dataset construction took 4.439 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.070 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 40.478 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7561728395061729]\n",
      "TIMING: dataset construction took 4.083 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.458 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.607 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5740740740740741]\n",
      "TIMING: dataset construction took 4.591 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.404 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 36.156 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6141975308641976]\n",
      "TIMING: dataset construction took 3.844 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.681 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.364 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6666666666666667]\n",
      "TIMING: dataset construction took 3.748 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.870 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.210 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7067901234567902]\n",
      "TIMING: dataset construction took 3.790 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.506 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.456 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7283950617283951]\n",
      "TIMING: dataset construction took 4.056 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 39.208 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.989 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5432098765432098]\n",
      "TIMING: dataset construction took 3.797 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 38.409 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 37.644 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6697530864197532]\n",
      "0.1 {'auc std': 0.08782626914134467, 'auc mean': 0.6362654320987655}\n",
      "TIMING: dataset construction took 7.296 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 34.899 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.440 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5864197530864197]\n",
      "TIMING: dataset construction took 8.158 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.130 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 32.725 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7222222222222222]\n",
      "TIMING: dataset construction took 8.316 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.097 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 32.981 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7037037037037037]\n",
      "TIMING: dataset construction took 8.043 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 34.308 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.295 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7561728395061729]\n",
      "TIMING: dataset construction took 7.571 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.626 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.896 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6265432098765432]\n",
      "TIMING: dataset construction took 7.990 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.674 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.379 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7314814814814814]\n",
      "TIMING: dataset construction took 8.020 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 34.315 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.068 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6604938271604939]\n",
      "TIMING: dataset construction took 7.624 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.518 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.438 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.595679012345679]\n",
      "TIMING: dataset construction took 7.808 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.099 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.065 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7685185185185185]\n",
      "TIMING: dataset construction took 8.530 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.012 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 32.966 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6049382716049383]\n",
      "TIMING: dataset construction took 8.342 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 32.796 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.794 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5061728395061729]\n",
      "TIMING: dataset construction took 7.532 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 35.316 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.710 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7191358024691359]\n",
      "TIMING: dataset construction took 7.579 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.517 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.447 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5833333333333333]\n",
      "TIMING: dataset construction took 8.118 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.283 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.078 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6481481481481481]\n",
      "TIMING: dataset construction took 8.895 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 34.012 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 32.388 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.4876543209876544]\n",
      "TIMING: dataset construction took 7.336 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 34.127 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.934 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6635802469135803]\n",
      "TIMING: dataset construction took 8.511 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 32.474 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 32.328 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7191358024691358]\n",
      "TIMING: dataset construction took 8.231 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.776 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.425 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5802469135802469]\n",
      "TIMING: dataset construction took 7.504 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 33.728 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 35.353 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.595679012345679]\n",
      "TIMING: dataset construction took 8.227 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 32.723 s\n",
      "Loading dataset from disk.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TIMING: dataset construction took 32.592 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.617283950617284]\n",
      "0.2 {'auc std': 0.07716913531857271, 'auc mean': 0.6438271604938273}\n",
      "TIMING: dataset construction took 16.732 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.272 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.716 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.4537037037037037]\n",
      "TIMING: dataset construction took 15.886 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.192 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.132 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5740740740740741]\n",
      "TIMING: dataset construction took 16.026 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.997 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.861 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7685185185185186]\n",
      "TIMING: dataset construction took 17.079 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.343 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.798 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7037037037037037]\n",
      "TIMING: dataset construction took 16.112 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.557 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.961 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7253086419753086]\n",
      "TIMING: dataset construction took 17.089 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.497 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.690 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8765432098765432]\n",
      "TIMING: dataset construction took 17.014 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 26.276 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.067 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6111111111111112]\n",
      "TIMING: dataset construction took 16.184 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 26.154 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.922 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7191358024691358]\n",
      "TIMING: dataset construction took 16.114 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.361 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.245 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8117283950617284]\n",
      "TIMING: dataset construction took 16.276 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 26.132 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.016 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7253086419753086]\n",
      "TIMING: dataset construction took 16.541 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.869 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.690 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.9351851851851852]\n",
      "TIMING: dataset construction took 16.858 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.700 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.368 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6234567901234568]\n",
      "TIMING: dataset construction took 16.624 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 26.882 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.771 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5555555555555556]\n",
      "TIMING: dataset construction took 16.941 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.041 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.677 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8333333333333333]\n",
      "TIMING: dataset construction took 15.567 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.649 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.470 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7067901234567902]\n",
      "TIMING: dataset construction took 16.927 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.901 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.932 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6697530864197532]\n",
      "TIMING: dataset construction took 16.716 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.639 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.207 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7067901234567902]\n",
      "TIMING: dataset construction took 15.955 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.410 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.280 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5277777777777777]\n",
      "TIMING: dataset construction took 16.322 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.079 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 25.522 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7623456790123457]\n",
      "TIMING: dataset construction took 16.631 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.829 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 24.520 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.824074074074074]\n",
      "0.4 {'auc std': 0.11906651778595918, 'auc mean': 0.7057098765432099}\n",
      "TIMING: dataset construction took 33.680 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.666 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.429 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8148148148148149]\n",
      "TIMING: dataset construction took 33.389 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.036 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.915 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7006172839506173]\n",
      "TIMING: dataset construction took 33.793 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.533 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.381 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.867283950617284]\n",
      "TIMING: dataset construction took 33.571 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.582 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.510 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7345679012345678]\n",
      "TIMING: dataset construction took 34.356 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.866 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.671 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7345679012345678]\n",
      "TIMING: dataset construction took 36.090 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.091 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.851 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5709876543209876]\n",
      "TIMING: dataset construction took 33.673 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.834 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.742 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6759259259259258]\n",
      "TIMING: dataset construction took 34.925 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.872 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.687 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7962962962962963]\n",
      "TIMING: dataset construction took 35.327 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 9.233 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 9.009 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8240740740740741]\n",
      "TIMING: dataset construction took 32.753 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 9.161 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.964 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7191358024691358]\n",
      "TIMING: dataset construction took 33.874 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.338 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.322 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8919753086419753]\n",
      "TIMING: dataset construction took 33.200 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 9.177 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.922 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6820987654320988]\n",
      "TIMING: dataset construction took 35.773 s\n",
      "Loading dataset from disk.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TIMING: dataset construction took 8.555 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.085 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5987654320987654]\n",
      "TIMING: dataset construction took 35.332 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.510 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.368 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7314814814814814]\n",
      "TIMING: dataset construction took 33.929 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.334 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.216 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6512345679012346]\n",
      "TIMING: dataset construction took 33.538 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 9.017 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.748 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7345679012345678]\n",
      "TIMING: dataset construction took 33.991 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 9.511 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 9.009 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.617283950617284]\n",
      "TIMING: dataset construction took 35.724 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.756 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.869 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8981481481481481]\n",
      "TIMING: dataset construction took 34.311 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.467 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.197 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6975308641975309]\n",
      "TIMING: dataset construction took 35.026 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 8.120 s\n",
      "Loading dataset from disk.\n",
      "TIMING: dataset construction took 7.847 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8055555555555556]\n",
      "0.8 {'auc std': 0.0914221288276763, 'auc mean': 0.7373456790123458}\n",
      "0.652094356261023\n"
     ]
    }
   ],
   "source": [
    "train_data = dataset.select(np.where(np.array(list(map(len, df['smiles'])))<=218)[0])\n",
    "test_data = dataset.select(np.where(np.array(list(map(len, df['smiles'])))>218)[0])\n",
    "\n",
    "scores = []\n",
    "for rate in rates:\n",
    "    score_dic = ablation_hiv_dc(train_data, test_data, rate, 20)\n",
    "    print(rate, score_dic)\n",
    "    scores.append(score_dic['auc mean'])\n",
    "print(np.mean(scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TIMING: dataset construction took 42.882 s\n",
      "Loading dataset from disk.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/honda/anaconda3/envs/deepchem/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py:98: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computed_metrics: [0.7407407407407407]\n",
      "TIMING: dataset construction took 42.898 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7438271604938271]\n",
      "TIMING: dataset construction took 43.568 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5987654320987654]\n",
      "TIMING: dataset construction took 42.949 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.478395061728395]\n",
      "TIMING: dataset construction took 43.000 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8148148148148149]\n",
      "TIMING: dataset construction took 45.167 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7067901234567902]\n",
      "TIMING: dataset construction took 44.496 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.808641975308642]\n",
      "TIMING: dataset construction took 43.518 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7438271604938271]\n",
      "TIMING: dataset construction took 43.286 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8333333333333333]\n",
      "TIMING: dataset construction took 43.622 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7006172839506173]\n",
      "TIMING: dataset construction took 42.693 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.9012345679012346]\n",
      "TIMING: dataset construction took 42.338 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5987654320987654]\n",
      "TIMING: dataset construction took 43.477 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8580246913580247]\n",
      "TIMING: dataset construction took 45.311 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.595679012345679]\n",
      "TIMING: dataset construction took 44.817 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8148148148148148]\n",
      "TIMING: dataset construction took 43.116 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7561728395061729]\n",
      "TIMING: dataset construction took 43.251 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.8209876543209877]\n",
      "TIMING: dataset construction took 42.695 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.7530864197530864]\n",
      "TIMING: dataset construction took 43.351 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.5401234567901234]\n",
      "TIMING: dataset construction took 43.081 s\n",
      "Loading dataset from disk.\n",
      "computed_metrics: [0.6759259259259258]\n",
      "0.8 {'auc std': 0.11010464515045111, 'auc mean': 0.7242283950617284}\n"
     ]
    }
   ],
   "source": [
    "score_dic = ablation_hiv_dc(train_data, test_data, 1, 20)\n",
    "print(rate, score_dic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
